For background information on the nature of the funds, see the Wikipedia page.
from pandas import read_csv
# The first 5 rows are headers (French and English) + junk
df = read_csv('/home/loic/Downloads/france-openspending.csv',
names=['program_name',
'project_name',
'recipient',
'recipient_postcode',
'project_start_date',
'project_end_date',
'amount_eligible',
'amount_provisionned',
'operation_postcode',
'zone',
'district',
'region',
'country',
'funding_category',
'acceptance_date',
'fund']).ix[5:].reset_index(drop=True)
df.head()
from pandas import to_datetime
date_columns = ['project_start_date', 'project_end_date', 'acceptance_date']
for column in date_columns:
df[column] = to_datetime(df[column])
df.info()
def parse_euros(amount):
return float(amount.replace(' €', '').replace(',', ''))
# Try the function out
print(parse_euros('286,000.00 €'), '\n')
currency_columns = ['amount_eligible', 'amount_provisionned']
for column in currency_columns:
# The 'amount_eligible' colummn has one float value:
df[column] = df[column].apply(str).apply(parse_euros)
df.info()
from pandas import options
options.display.float_format = '{:,.0f} K€'.format
amounts = df[['amount_eligible', 'amount_provisionned']].apply(lambda x: x / 1000)
amounts.describe().ix[1:]
print('total eligible: %.0f M€' % (amounts['amount_eligible'].sum() / 1000))
print('total provisionned: %.0f M€' % (amounts['amount_provisionned'].sum() / 1000))
Take away points:
top subsidies
sorted_amounts = amounts.sort_values(by='amount_provisionned', ascending=False)
sorted_amounts.head(10)
df[['recipient', 'amount_provisionned', 'project_name', 'fund']].ix[sorted_amounts.head(10).index]
from pandas import cut
bins = [0, 1000, 2000,
3000, 4000,
5000, 6000,
7000, 8000,
9000, 10000,
11000, 12000,
13000, 14000,
15000, 20000]
provisionned_bins = cut(amounts['amount_provisionned'], bins)
provisionned_binned = amounts.groupby(provisionned_bins).agg(sum)
provisionned_binned.index.names = ['bins']
provisionned_binned.fillna(0)
from bokeh.charts import Scatter, Histogram, Bar, output_notebook, show
from bokeh.models import PrintfTickFormatter
output_notebook()
# Work-around to get the labels in order
provisionned_binned['labels'] = [
'a (0, 1000]',
'b (1000, 2000]',
'c (2000, 3000]',
'd (3000, 4000]',
'e (4000, 5000]',
'f (5000, 6000]',
'g (6000, 7000]',
'h (7000, 8000]',
'i (8000, 9000]',
'j (9000, 10000]',
'k (10000, 11000]',
'l (11000, 12000]',
'm (12000, 13000]',
'o (13000, 14000]',
'p (14000, 15000]',
'q (15000, 20000]'
]
bar = Bar(provisionned_binned,
values='amount_provisionned',
label='labels',
ylabel='Total subsidy spending (K€)',
title='Spending across project size (K€)')
show(bar)
amounts = amounts.sort_values(by='amount_provisionned')
amounts['cumsum'] = amounts['amount_provisionned'].cumsum()
amounts['fraction_of_total'] = amounts['cumsum'] / amounts['amount_provisionned'].sum()
amounts.head()
p = Scatter(amounts, x='amount_provisionned', y='fraction_of_total')
show(p)
Take away point: 80% of subsidies are spent on projects < 5M€
options.display.float_format = '{:,.2f}'.format
amounts.dropna(inplace=True)
amounts.where(amounts['amount_eligible'] != 0, inplace=True)
amounts['ratio'] = amounts['amount_provisionned'] / amounts['amount_eligible'] * 100
s = Scatter(amounts, y='ratio', x='amount_provisionned',
ylabel="Ratio amount programmed / eligible (%)", xlabel="Amount programmed (K€)")
show(s)
funds = df['fund'].value_counts()
funds.index.name = 'Fund'
funds.name = 'number_of_subsidies'
funds
b = Bar(funds, ylabel='Number of grants')
show(b)
df['funding_category'].value_counts()
df['short_category'] = df['funding_category'].apply(lambda x: str(x)[:8])
b = Bar(df, values='amount_provisionned', agg='sum', stack='short_category', label='fund', legend='top_right')
show(b)
df['departements'] = df['recipient_postcode'].apply(str).apply(lambda x: x[:2])
geo = df.groupby('departements').agg('sum')
geo.dropna(inplace=True)
geo.index.names = ['departements']
geo.info()
geo.sort_values(by='amount_eligible', inplace=True, ascending=False)
geo.head()
# Test the map
from folium import Map, GeoJson
import pandas as pd
departements = r'/home/loic/repos/eijc16/departements.json'
#Let Folium determine the scale
france = Map(location=[47, 2], zoom_start=6)
GeoJson(open(departements), name='geojson').add_to(france)
france
geo.reset_index(level=0, inplace=True)
geo.sort_values(by='departements').head()
from pandas import DataFrame, concat
# fill in missing departements (for folium)
for dep in [str(n).zfill(2) for n in range(1, 96)]:
if dep not in geo['departements'].values:
missing_district = DataFrame({'departements': [str(dep)], 'amount_eligible': [0], 'amount_provisionned': [0]})
geo = concat([geo, missing_district])
geo.where(geo['departements'] != 'na', inplace=True)
geo.dropna(inplace=True)
geo.departements.values
# Corsica: dude this map is picky
corsica = DataFrame({'departements': ['2A', '2B'], 'amount_eligible': [0, 0], 'amount_provisionned': [0, 0]})
geo = concat([geo, corsica])
geo.tail()
from math import log10
geo['amount_provisionned'] = geo['amount_provisionned'].replace(0, 0.1)
geo['scale'] = geo['amount_provisionned'].apply(log10)
france.choropleth(
departements,
data=geo,
columns=['departements', 'scale'],
key_on='properties.code',
fill_color='YlGn', fill_opacity=0.7, line_opacity=0.2,
legend_name='Subsidies')
france